source("utils.R")Assemble environment and DOC data.
Join env and DOC
Load data
load("data/00.all_env.Rdata")
load("data/01.doc_data.Rdata")Join
df_all <- df_env %>%
left_join(df_doc %>% select(lon, lat, doc = doc_mean), by = join_by(lon, lat))Plot some maps
ggmap(df_env, var = "oxygen", type = "raster") +
geom_point(data = df_all %>% filter(!is.na(doc)), aes(x = lon, y = lat), size = 0.5)ggmap(df_all, var = "doc", type = "point") +
ggplot2::scale_colour_viridis_c(option = "A", trans = "log10", na.value = NA)Some data exploration
Response variable
ggplot(df_all) + geom_histogram(aes(x = doc), bins = 100)DOC distribution has a long-tail. Let’s try a log transformation.
ggplot(df_all) + geom_histogram(aes(x = log(doc)), bins = 100)This is better! Let’s indeed apply the log transformation.
df_all <- df_all %>% mutate(doc_log = log(doc))Note as we will be dropping pixels with any missing value in the predictors, the distribution of DOC values is likely to change further, and this is actually the case when prediction is performed using all environmental predictors (see next notebook).
Explanatory variables
Let’s now investigate the big tendencies in our dataset.
# Need to remove lon and lat and to scale because units differ between variables
df_pca <- df_all %>% select(c(lon, lat, doc, doc_log, everything()))
pca_all <- FactoMineR::PCA(df_pca, quanti.sup = 1:2, scale.unit = TRUE, graph = FALSE)Plot eigenvalues.
plot_eig(pca_all)Plot variables.
plot(pca_all, choix = "var", axes = c(1, 2), cex = 0.7)## Get coordinates of individuals
inds <- pca_all$ind$coord %>% as_tibble()
# Set nice names for columns
colnames(inds) <- str_c("dim", paste(c(1:ncol(inds))))
# And join with initial dataframe of objects
inds <- df_all %>% bind_cols(inds)
ggmap(inds, "dim1", type = "point", palette = div_pal)ggmap(inds, "dim2", type = "point", palette = div_pal)Distribution of DOC-annotated data VS overall env data
It is important that the distribution of env data used to predict DOC is representative of the overall environmental data that we are going to used to make predictions.
# List env variables
env_vars <- df_env %>% select(-c(lon, lat)) %>% colnames()
# Reshape tables of doc-annotated and overall env data
dist_doc_ann <- df_all %>%
drop_na(doc) %>%
select(all_of(env_vars)) %>%
mutate(type = "doc-annotated") %>%
pivot_longer(all_of(env_vars), names_to = "variable")
dist_env <- df_env %>%
select(all_of(env_vars)) %>%
mutate(type = "overall") %>%
pivot_longer(all_of(env_vars), names_to = "variable")
# Assemble and plot
bind_rows(dist_doc_ann, dist_env) %>%
drop_na() %>%
ggplot() +
geom_density(aes(x = value, colour = type)) +
facet_wrap(~variable, scales = "free")Distributions are not too different, this is good, it means that our model won’t extrapolate outside of the range it was trained on.
Save everything
Two datasets are saved:
one with both DOC and env data at locations where DOC is available: this will be used to train the model.
one with global map of env data, to predict DOC at the global scale
save(df_all, df_env, file = "data/02.all_data.Rdata")